"""
字体反爬虫
爬取目标站点影片评分、评价人数、票房数据
目标站点：http://www.porters.vip/confusion/movie.html
评分：9.7 => &#xe624.&#xe9c7
评价人数：477.9万 => &#xf593&#xe9c7&#xe9c7.&#xe624万
票房数据: 56.83亿 => &#xea16&#xe339.&#xefd4&#xf19a亿

css:
    font-family: stonefont;
"""
import re
import os
import requests
from fontTools.ttLib import TTFont
from lxml import etree

# HTTP request headers; the User-Agent presents the client as desktop Chrome.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.159 Safari/537.36 '
    ),
}

# Hand-maintained mapping from glyph names ("uni" + hex code point) to the
# digit each glyph displays.
base_font = dict(
    uniE339='6',
    uniE624='9',
    uniE7DF='2',
    uniE9C7='7',
    uniEA16='5',
    uniEE76='0',
    uniEFD4='8',
    uniF19A='3',
    uniF57B='1',
    uniF593='4',
)
# Disabled: fetch movie.woff if it is missing, load its cmap and translate
# each code point to a digit via base_font.
# DIR_NAME = os.path.dirname(os.path.abspath(__name__))
# WOFF_PATH = os.path.join(DIR_NAME, 'movie.woff')
# WOFF_XML_PATH = os.path.join(DIR_NAME, 'movie.xml')
# if not os.path.exists(WOFF_PATH):  # font file does not exist yet
#     woff_url = 'http://www.porters.vip/confusion/font/movie.woff'
#     woff_res = requests.get(woff_url, headers=headers)
#     with open('./movie.woff', 'wb') as fp:
#         fp.write(woff_res.content)
#
# font = TTFont(WOFF_PATH)  # open movie.woff from the current directory
# # font.saveXML(WOFF_XML_PATH)     # dump the font as XML (character mapping table)
# font_map = font['cmap'].getBestCmap()   # code point -> glyph name mapping, taken from the font's cmap table
# for key in font_map:    # replace each glyph name with its digit from base_font
#     font_map[key] = base_font.get(font_map.get(key))
# print(font_map)

url = 'http://www.porters.vip/confusion/movie.html'


def decode_font_entities(text, mapping):
    """Replace obfuscated numeric character references with plain digits.

    Each key of ``mapping`` is a glyph name such as ``'uniE624'``; the
    matching reference in the page source is ``&#xe624``. References are
    matched case-insensitively and an optional terminating ``;`` is consumed,
    so ``&#xE624;`` decodes to ``9`` rather than ``9;``.

    Args:
        text: Raw HTML source of the page.
        mapping: Dict of glyph name to the digit that glyph displays.

    Returns:
        ``text`` with every mapped reference replaced by its digit.
        References that are not in ``mapping`` are left untouched.
    """
    digit_by_code = {
        glyph.replace('uni', '', 1).lower(): str(digit)
        for glyph, digit in mapping.items()
    }
    if not digit_by_code:
        return text
    # Longest codes first so one code can never shadow a longer one that
    # starts with the same hex digits.
    codes = sorted(digit_by_code, key=len, reverse=True)
    pattern = re.compile(
        '&#x(' + '|'.join(re.escape(code) for code in codes) + ');?',
        re.IGNORECASE,
    )
    return pattern.sub(lambda match: digit_by_code[match.group(1).lower()], text)


def first_text(tree, expression):
    """Return the first text node selected by ``expression``, stripped.

    Args:
        tree: Parsed lxml element to query.
        expression: XPath expression selecting text nodes.

    Raises:
        IndexError: If the expression selects nothing; the message names the
            XPath so a page layout change is easy to diagnose.
    """
    matches = tree.xpath(expression)
    if not matches:
        raise IndexError('no text node matches XPath: {}'.format(expression))
    return matches[0].strip()


def scrape_movie(page_url, request_headers, mapping, timeout=10):
    """Download the movie page and extract its de-obfuscated fields.

    Args:
        page_url: URL of the movie page.
        request_headers: HTTP headers sent with the request.
        mapping: Glyph-name-to-digit mapping passed to
            ``decode_font_entities``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Dict with keys ``name``, ``score``, ``star`` and ``box``.

    Raises:
        requests.RequestException: On connection failure, timeout or a
            non-2xx response.
        IndexError: If the name, score or star node is missing from the page.
    """
    # A timeout keeps the script from hanging on an unresponsive server, and
    # raise_for_status stops an error page from being parsed as movie data.
    response = requests.get(page_url, headers=request_headers, timeout=timeout)
    response.raise_for_status()

    # &#xf593&#xe9c7&#xe9c7.&#xe624万 => 477.9万
    page_source = decode_font_entities(response.text, mapping)
    tree = etree.HTML(page_source, parser=etree.HTMLParser(encoding='utf-8'))

    # Movie title
    name = first_text(
        tree,
        '//*[contains(@class, "movie-brief-container")]//*[contains(@class, "name")]//text()',
    )
    # User rating
    score = first_text(
        tree,
        '//*[contains(@class, "score")]//*[contains(@class, "info-num")]//*[contains(@class, "stonefont")]//text()',
    )
    # Number of ratings
    star = first_text(
        tree,
        '//*[contains(@class, "score")]//*[contains(@class, "score-num")]//*[contains(@class, "stonefont")]//text()',
    )
    # Box office: join every text node under the box element and drop all
    # whitespace between them.
    box = "".join(tree.xpath('//*[contains(@class, "box")]//text()'))
    box = re.sub(r'\s', "", box)

    return dict(name=name, score=score, star=star, box=box)


if __name__ == '__main__':
    # Result
    movie = scrape_movie(url, headers, base_font)
    print(movie)